All code used for the analyses can be found throughout this report by expanding the code-blocks in the .html-file on the right-hand side within the appropriate tabs, or through the associated .Rmd file, where it can also be run. You will have to install all the necessary packages yourself though, which can be non-trivial.
suppressPackageStartupMessages({
library('biomaRt') # downloading gene data
library("DT") # for pretty printing of paged tables
library("limma") # RNA-seq analysis
library("edgeR") # RNA-seq analysis
library("DESeq2") # RNA-seq analysis
library("here") # Better structure and relative paths
library("GO.db") # downloading gene data
library("org.Mm.eg.db")
library("cbmr") # in-house convenience functions
library("AnnotationDbi")
library("ggrepel")
# library("sva")
# library("DESeq2")
library("RUVSeq")
library("tidyverse") # general data manipulation
})
full_script_name <- rstudioapi::getSourceEditorContext()$path
script_name <- basename(full_script_name)
data_dir <- paste0(tools::file_path_sans_ext(full_script_name),"_data/")
dir.create(data_dir, showWarnings = F)
# load data
g_counts_raw <- readRDS("../data-raw/salmon.merged.gene_counts.rds")
rna_counts_all <- g_counts_raw %>% assays() %>% .$counts
# fix sample IDS in count matrix
colnames(rna_counts_all) <-
sub(pattern = "X[0-9]{4}_", replacement = "", x = colnames(rna_counts_all)) %>%
sub(pattern = "_S[0-9]{1,2}", replacement = "", x=.)
# fix sample IDS in meta data sheet
meta_data_sheet <- readxl::read_excel("0177_metadata.xlsx") %>%
transmute(`Sample.ID`=sub(pattern = ".....", replacement = "", x=`Sample ID (Library ID)`),
gRNA = factor(gsub(x = Condition1, pattern = "g|gRNA_", replacement = ""), levels=c("EV", "PTPRK", "PHOX2B")),
Media = factor(gsub(x = Condition2, pattern = "_media", replacement = ""), levels=c("proliferation","differentiation")),
Extraction=factor(`Extraction pool (A,B,C)`, levels=c("A", "B", "C", "D", "E")),
Replicate=factor(gsub(`Condition 3`, pattern="replicate_", replacement = ""), levels=c("1","2","3")),
Observed_batch=factor(Comments)) %>%
as.data.frame()
#Create DGE list of relevant samples
rna_counts <- rna_counts_all[, as.character(c(1:18))]
y <- DGEList(rna_counts,
group = meta_data_sheet$Observed_batch,
samples=meta_data_sheet)
design <- model.matrix(~0 + group, data=y$samples)
idx_expressed <- filterByExpr(y, design = design)
y <- y[idx_expressed, ]
y <- calcNormFactors(y)
# Calculate CPM and print
log_cpm <- cpm(y, log=T)
saveRDS(object =log_cpm, file=paste0(data_dir,
"log_CPM_values_ensembl_ID_by_sample.RDS"))
write_tsv(x = as.data.frame(log_cpm),
file=paste0(data_dir, "log_CPM_values_ensembl_ID_by_sample.tsv"))
# Get Description, external gene name via Ensembl/biomaRt
# mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
# saveRDS(mart, file=paste0("shared_data/", "mouse_mart.RDS"))
mart <- readRDS(file = paste0("shared_data/", "mouse_mart.RDS"))
gene_info_list <- getBM(filters= "ensembl_gene_id",
attributes= c("ensembl_gene_id", "external_gene_name","description"),
values=rownames(y),
mart=mart)
# get names via org.Mm.eg.db
gene_names <- mapIds(x = org.Mm.eg.db,
keys = rownames(y),
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
# uncorrected
log_cpm_with_names <- log_cpm
rownames(log_cpm_with_names) <- gene_names
saveRDS(object=log_cpm_with_names, file = paste0(data_dir, "log_CPM_values_gene_name_by_sample.RDS"))
write_tsv(x = as.data.frame(log_cpm_with_names),
file =paste0(data_dir, "log_CPM_values_gene_name_by_sample.tsv"))
y <- estimateDisp(y, design = design)
fit <- glmQLFit(y, design = y$design, robust = TRUE)
contrast_matrix <- makeContrasts(batch_effect=groupA-groupB,
levels=design)
all_genes_test_full_info <- list()
for (contrast in colnames(contrast_matrix)) {
all_genes_test <- edgeR_tester(contrast_matrix = contrast_matrix, contrast=contrast, efit = fit)
all_genes_test_full_info[[contrast]] <- merge(x=all_genes_test,
y=gene_info_list,
by.x = "ENSEMBL", by.y="ensembl_gene_id",
all.x=T)
}
# print tsv-tables:
for (contrast in colnames(contrast_matrix)) {
all_genes_test_full_info[[contrast]] %>% write_tsv(file=paste0(data_dir, contrast,"_DGE_analysis.tsv"))
}
GO_terms <- get_enrichment_terms(org_db = org.Mm.eg.db, ensembl_ids = rownames(y),
min_genes = 5, max_genes = 500)$BP
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
ontology_tests <- run_ontology_tests(GO_terms,
y,
contrast_matrix = contrast_matrix,
fun=limma::camera)
## 70.1437066402379% of genes have an annotation
for (contrast in colnames(contrast_matrix)) {
ontology_tests[[contrast]] %>% write_tsv(file=paste0(data_dir, contrast,"_GO_analysis.tsv"))
}
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] forcats_0.5.1 stringr_1.4.0
## [3] dplyr_1.0.7 purrr_0.3.4
## [5] readr_2.1.1 tidyr_1.1.4
## [7] tibble_3.1.6 tidyverse_1.3.1
## [9] RUVSeq_1.28.0 EDASeq_2.28.0
## [11] ShortRead_1.52.0 GenomicAlignments_1.30.0
## [13] Rsamtools_2.10.0 Biostrings_2.62.0
## [15] XVector_0.34.0 BiocParallel_1.28.3
## [17] ggrepel_0.9.1 ggplot2_3.3.5
## [19] cbmr_0.0.0.9001 org.Mm.eg.db_3.14.0
## [21] GO.db_3.14.0 AnnotationDbi_1.56.2
## [23] here_1.0.1 DESeq2_1.34.0
## [25] SummarizedExperiment_1.24.0 Biobase_2.54.0
## [27] MatrixGenerics_1.6.0 matrixStats_0.61.0
## [29] GenomicRanges_1.46.1 GenomeInfoDb_1.30.0
## [31] IRanges_2.28.0 S4Vectors_0.32.3
## [33] BiocGenerics_0.40.0 edgeR_3.36.0
## [35] limma_3.50.0 DT_0.20
## [37] biomaRt_2.50.1
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.4.1 aroma.light_3.24.0
## [4] BiocFileCache_2.2.0 splines_4.1.1 digest_0.6.29
## [7] foreach_1.5.1 htmltools_0.5.2 fansi_0.5.0
## [10] magrittr_2.0.1 memoise_2.0.1 tzdb_0.2.0
## [13] annotate_1.72.0 modelr_0.1.8 vroom_1.5.7
## [16] R.utils_2.11.0 prettyunits_1.1.1 jpeg_0.1-9
## [19] colorspace_2.0-2 rvest_1.0.2 blob_1.2.2
## [22] rappdirs_0.3.3 haven_2.4.3 xfun_0.28
## [25] crayon_1.4.2 RCurl_1.98-1.5 jsonlite_1.7.2
## [28] genefilter_1.76.0 survival_3.2-13 iterators_1.0.13
## [31] glue_1.5.1 gtable_0.3.0 zlibbioc_1.40.0
## [34] DelayedArray_0.20.0 scales_1.1.1 DBI_1.1.1
## [37] Rcpp_1.0.7 xtable_1.8-4 progress_1.2.2
## [40] bit_4.0.4 htmlwidgets_1.5.4 httr_1.4.2
## [43] RColorBrewer_1.1-2 ellipsis_0.3.2 pkgconfig_2.0.3
## [46] XML_3.99-0.8 R.methodsS3_1.8.1 sass_0.4.0
## [49] dbplyr_2.1.1 locfit_1.5-9.4 utf8_1.2.2
## [52] tidyselect_1.1.1 rlang_0.4.12 munsell_0.5.0
## [55] cellranger_1.1.0 tools_4.1.1 cachem_1.0.6
## [58] cli_3.1.0 generics_0.1.1 RSQLite_2.2.9
## [61] broom_0.7.10 evaluate_0.14 fastmap_1.1.0
## [64] yaml_2.2.1 fs_1.5.2 knitr_1.36
## [67] bit64_4.0.5 KEGGREST_1.34.0 R.oo_1.24.0
## [70] xml2_1.3.3 compiler_4.1.1 rstudioapi_0.13
## [73] filelock_1.0.2 curl_4.3.2 png_0.1-7
## [76] reprex_2.0.1 statmod_1.4.36 geneplotter_1.72.0
## [79] bslib_0.3.1 stringi_1.7.6 GenomicFeatures_1.46.1
## [82] lattice_0.20-45 Matrix_1.4-0 vctrs_0.3.8
## [85] pillar_1.6.4 lifecycle_1.0.1 jquerylib_0.1.4
## [88] data.table_1.14.2 bitops_1.0-7 rtracklayer_1.54.0
## [91] R6_2.5.1 BiocIO_1.4.0 latticeExtra_0.6-29
## [94] hwriter_1.3.2.1 codetools_0.2-18 MASS_7.3-54
## [97] assertthat_0.2.1 rprojroot_2.0.2 rjson_0.2.20
## [100] withr_2.4.3 GenomeInfoDbData_1.2.7 parallel_4.1.1
## [103] hms_1.1.1 grid_4.1.1 rmarkdown_2.11
## [106] lubridate_1.8.0 restfulr_0.0.13
n_samples <- nrow(meta_data_sheet)
n_genes <-nrow(y)
n_counts_per_sample <- colSums(rna_counts)
n_counts_total <- sum(n_counts_per_sample)
n_counts_per_gene_per_sample <- (n_counts_total/n_genes)/n_samples
From the sequencing, we obtain 16144 genes that are expressed in a appreciable amount across the samples to be used for our downstream analyses. With a final, filtered transcript count of 317 million across the 18 samples, we have an average of 1091 RNA-transcripts for each gene in each sample.
Check out the tabs below to see various QC-visualizations of the data:
The images below shows heatmaps of the log-normalized distances between each possible sample-pair with more similar pairs having negative values, and less similar pairs having positive values. Here, we can see that the samples seem to cluster based on genotype, but not as much on treatment.
annotation_col_df <- dplyr::select(meta_data_sheet, gRNA, Extraction)
rownames(annotation_col_df) <- meta_data_sheet$Sample.ID
annotation_row_df <- dplyr::select(meta_data_sheet, Media)
rownames(annotation_row_df) <- meta_data_sheet$Sample.ID
plot_sample_heatmap(log_cpm,
method="MDS",
labels_col = meta_data_sheet$Sample.ID,
labels_row = meta_data_sheet$Sample.ID,
annotation_col = annotation_col_df[c("gRNA", "Extraction")],
annotation_row = annotation_row_df["Media"],
display_numbers=T,
fontsize_number=6,
main="Multiple dimensional scaling (MDS) heatmap"
)
The figures below show the location of the samples in reduced 2-dimensional space.
ggplot_mds(y = y,
dim_plot = c(1,2),
size=1,
colour_by = y$samples$Media) +
ggtitle("MDS-plot colored by Media (Dimensions 1 & 2)") +
aes(label = rownames(y$samples)) +
geom_label_repel()
ggplot_mds(y = y,
dim_plot = c(3,4),
size=1,
colour_by = y$samples$Media) +
ggtitle("MDS-plot colored by Media (Dimensions 3 & 4)") +
aes(label = rownames(y$samples)) +
geom_label_repel()
ggplot_mds(y = y,
dim_plot = c(1,2),
colour_by = y$samples$gRNA,
size=1) +
ggtitle("MDS-plot colored by gRNA") +
aes(label = rownames(y$samples)) +
geom_label_repel()
The plots below show the log2-fold change in gene expression for all genes in each sample, compared to the rest of the samples. Here, we see the typical distribution with only a few outliers.
withr::with_par(list(mfrow = c(2, 3)), {
for (i in seq_len(ncol(y))) {
edgeR::plotMD.DGEList(y, column = i,cex.lab=1)
graphics::abline(h = 0)
}
})
Throughout the analysis-section, you will see tables containing the differential gene expression analysis results as well as the gene-ontology enrichment results for the different contrasts, or groupings, as outlined below:
Below is shown the resulting, unadjusted P-value distribution from differential gene expression testing for each contrast:
withr::with_par(list(mfrow = c(1,3)),
{for (contrast in colnames(contrast_matrix)) {
hist(all_genes_test_full_info[[contrast]]$PValue,
main = contrast, xlab = "P-value") }}
)
In the following sections, you can order the rows in the tables as you wish by directly interacting with the tables. Similarly, you can search for specific genes or GO-terms, limit the data-ranges and download the data as excel and csv-files. (Copies of the full tables are already found in the corresponding data directory, since searching in the full tables here can be a bit slow).
The gene ontology analysis investigates if some gene ontology terms are over-represented among the up-regulated and down-regulated genes respectively to investigate if specific biological processes are up- or down-regulated.
all_genes_test_full_info[["batch_effect"]] %>%
filter(FDR<0.05) %>%
transmute(Gene = external_gene_name,
ENSEMBL_ID = rownames(all_genes_test_full_info),
description,
logFC=round(logFC,2),
PValue=round(PValue,2),
FDR=round(FDR,5)) %>%
DT::datatable(extensions = 'Buttons', filter="top", rownames = FALSE, options = list(
dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel', 'colvis')))
all_genes_test_full_info[["batch_effect"]] %>%
transmute(Gene = external_gene_name,
ENSEMBL_ID = rownames(all_genes_test_full_info),
description,
logFC=round(logFC,2),
PValue=round(PValue,2),
FDR=round(FDR,5)) %>%
DT::datatable(extensions = 'Buttons', filter="top", rownames = FALSE, options = list(
dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel', 'colvis')))
plot_volcano <- function(contrast){
df <- all_genes_test_full_info[[contrast]]
allLogFc <- df[["logFC"]]
minLogFc <- min(allLogFc)
maxLogFc <- max(allLogFc)
minPval <- max(-log10(df[["PValue"]]))
maxPval <- 0
ggplot(df, aes(x = logFC,
y = -log10(PValue),
colour = FDR < 0.05
)
) +
geom_point(size=0.5, alpha=0.2) +
scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black"), name="Significance", labels = c("TRUE" = "FDR < 0.05", "FALSE" = "FDR ≥ 0.05"), breaks = c("TRUE", "FALSE")) +
ylab(expression(paste(-log[10], "(P-value)"))) +
xlab(expression(paste(log[2], "(Fold Change)"))) +
scale_x_continuous(limits = c(minLogFc, maxLogFc)) +
scale_y_continuous(limits = c(maxPval, minPval)) +
theme_bw()
}
plot_volcano("batch_effect")
ontology_tests[["batch_effect"]] %>%
filter(FDR<0.05) %>%
transmute(ID,
Direction=as.factor(Direction),
TERM,
`#genes`=NGenes,
PValue=round(PValue,2),
FDR=round(FDR, 5)) %>%
DT::datatable(extensions = 'Buttons', filter="top", rownames = FALSE, options = list(
dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel', 'colvis')))
ontology_tests[["batch_effect"]] %>%
transmute(ID, Direction=as.factor(Direction), TERM, `#genes`=NGenes, PValue=round(PValue,2),FDR=round(FDR, 5)) %>%
DT::datatable(extensions = 'Buttons', filter="top", rownames = FALSE, options = list(
dom = 'Bfrtip',buttons = c('copy', 'csv', 'excel', 'colvis')))